# Tree Based Models
# Root folder of the course material. Every helper script and dataset below is
# resolved relative to it, so only this line changes when the project moves.
project_dir <- "/Users/ssobrinou/IE/Advanced/2019_Advanced"
code_dir <- file.path(project_dir, "Regression", "code")

# Attaches the packages used by the script (see the masking messages below).
source(file.path(code_dir, "load_libraries.R"))
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Project helpers; presumably these define f_partition() and the regression
# metrics used later in the script -- TODO confirm against the helper files.
source(file.path(code_dir, "f_partition.R"))
source(file.path(code_dir, "regression_metrics.R"))

# Read the prepared automobile data and split it; the result is a list with
# a `train` and a `test` data.table (see the str() output below).
whole_data <- f_partition(
  df = fread(file.path(project_dir, "Datasets", "data_automobile_ready.csv")),
  test_proportion = 0.2,
  seed = 872367823
)
str(whole_data)
## List of 2
## $ train:Classes 'data.table' and 'data.frame': 156 obs. of 31 variables:
## ..$ fuel_gas : int [1:156] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ aspiration_turbo : int [1:156] 0 0 0 0 0 0 1 0 0 0 ...
## ..$ doors_others : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ doors_two : int [1:156] 0 1 0 1 1 1 1 1 0 0 ...
## ..$ body_others : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ body_sedan : int [1:156] 1 0 0 0 0 0 0 1 1 1 ...
## ..$ body_wagon : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ drive_others : int [1:156] 0 0 0 0 0 0 0 0 1 0 ...
## ..$ drive_rwd : int [1:156] 0 0 0 1 1 0 1 1 0 0 ...
## ..$ engine_loc_others : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ wheel_base : num [1:156] 93.1 86.6 97.2 98.4 94.5 94.5 95.9 94.5 97 96.5 ...
## ..$ length : num [1:156] 167 145 173 176 169 ...
## ..$ width : num [1:156] 64.2 63.9 65.2 65.6 64 64 66.3 64 65.4 64 ...
## ..$ height : num [1:156] 54.1 50.8 54.7 52 52.6 51.4 50.2 52.6 54.3 54.5 ...
## ..$ weight : int [1:156] 1950 1819 2324 2714 2204 2221 2818 2169 2385 2010 ...
## ..$ engine_type_others: int [1:156] 0 0 0 0 0 0 0 0 1 0 ...
## ..$ cyl_others : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ cyl_six : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ engine_size : int [1:156] 91 92 120 146 98 109 156 98 108 92 ...
## ..$ fuel_sys_idi : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ fuel_sys_mpfi : int [1:156] 0 0 0 1 0 1 0 0 0 0 ...
## ..$ fuel_sys_others : int [1:156] 0 1 0 0 0 0 1 0 0 1 ...
## ..$ bore : num [1:156] 3.08 2.91 3.33 3.62 3.19 3.19 3.59 3.19 3.62 2.91 ...
## ..$ stroke : num [1:156] 3.15 3.41 3.47 3.5 3.03 3.4 3.86 3.03 2.64 3.41 ...
## ..$ compr_ratio : num [1:156] 9 9.2 8.5 9.3 9 8.5 7 9 9 9.2 ...
## ..$ hp : int [1:156] 68 76 97 116 70 90 145 70 82 76 ...
## ..$ peak_rpm : int [1:156] 5000 6000 5200 4800 4800 5500 5000 4800 4800 6000 ...
## ..$ city_mpg : int [1:156] 31 31 27 24 29 24 19 29 24 30 ...
## ..$ high_mpg : int [1:156] 38 38 34 30 34 29 24 34 25 34 ...
## ..$ price : int [1:156] 7395 6855 8949 11549 8238 9980 12764 8058 9233 7295 ...
## ..$ make_agg_toyota : int [1:156] 0 0 0 1 1 0 0 1 0 0 ...
## ..- attr(*, ".internal.selfref")=<externalptr>
## $ test :Classes 'data.table' and 'data.frame': 39 obs. of 31 variables:
## ..$ fuel_gas : int [1:39] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ aspiration_turbo : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ doors_others : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ doors_two : int [1:39] 0 0 0 1 1 0 0 1 1 0 ...
## ..$ body_others : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ body_sedan : int [1:39] 1 0 1 0 0 0 1 0 0 1 ...
## ..$ body_wagon : int [1:39] 0 1 0 0 0 0 0 0 0 0 ...
## ..$ drive_others : int [1:39] 1 0 0 0 0 0 0 0 0 0 ...
## ..$ drive_rwd : int [1:39] 0 0 1 0 0 0 0 0 0 0 ...
## ..$ engine_loc_others : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ wheel_base : num [1:39] 99.4 105.8 103.5 88.4 93.7 ...
## ..$ length : num [1:39] 177 193 189 141 157 ...
## ..$ width : num [1:39] 66.4 71.4 66.9 60.3 63.8 63.8 63.8 64 65.2 65.2 ...
## ..$ height : num [1:39] 54.3 55.7 55.7 53.2 50.8 50.6 50.6 52.6 53.3 54.1 ...
## ..$ weight : int [1:39] 2824 2954 3230 1488 1876 1967 1989 1940 2289 2304 ...
## ..$ engine_type_others: int [1:39] 0 0 0 1 0 0 0 0 0 0 ...
## ..$ cyl_others : int [1:39] 1 1 0 1 0 0 0 0 0 0 ...
## ..$ cyl_six : int [1:39] 0 0 1 0 0 0 0 0 0 0 ...
## ..$ engine_size : int [1:39] 136 136 209 61 90 90 90 92 110 110 ...
## ..$ fuel_sys_idi : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ fuel_sys_mpfi : int [1:39] 1 1 1 0 0 0 0 0 0 0 ...
## ..$ fuel_sys_others : int [1:39] 0 0 0 0 0 0 0 1 1 1 ...
## ..$ bore : num [1:39] 3.19 3.19 3.62 2.91 2.97 2.97 2.97 2.91 3.15 3.15 ...
## ..$ stroke : num [1:39] 3.4 3.4 3.39 3.03 3.23 3.23 3.23 3.41 3.58 3.58 ...
## ..$ compr_ratio : num [1:39] 8 8.5 8 9.5 9.4 9.4 9.4 9.2 9 9 ...
## ..$ hp : int [1:39] 115 110 182 48 68 68 68 76 86 86 ...
## ..$ peak_rpm : int [1:39] 5500 5500 5400 5100 5500 5500 5500 6000 5800 5800 ...
## ..$ city_mpg : int [1:39] 18 19 16 47 31 31 31 30 27 27 ...
## ..$ high_mpg : int [1:39] 22 25 22 53 38 38 38 34 33 33 ...
## ..$ price : int [1:39] 17450 18920 30760 5151 6377 6229 6692 6529 9095 8845 ...
## ..$ make_agg_toyota : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, ".internal.selfref")=<externalptr>
# Convert every integer column of each partition to double.
# data.table's `:=` updates the table by reference, so the tables held in
# `whole_data` are modified directly; the table is still returned so that
# lapply() rebuilds the list with the same element names.
whole_data <- lapply(whole_data, function(x) {
  # vapply() always yields a logical vector (even for a zero-column table),
  # unlike sapply(), whose return type depends on its input.
  int_idx <- which(vapply(x, is.integer, logical(1), USE.NAMES = FALSE))
  # Skip the assignment entirely when there is nothing to convert.
  if (length(int_idx) > 0) {
    x[, (int_idx) := lapply(.SD, as.numeric), .SDcols = int_idx]
  }
  x
})
str(whole_data)
## List of 2
## $ train:Classes 'data.table' and 'data.frame': 156 obs. of 31 variables:
## ..$ fuel_gas : num [1:156] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ aspiration_turbo : num [1:156] 0 0 0 0 0 0 1 0 0 0 ...
## ..$ doors_others : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ doors_two : num [1:156] 0 1 0 1 1 1 1 1 0 0 ...
## ..$ body_others : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ body_sedan : num [1:156] 1 0 0 0 0 0 0 1 1 1 ...
## ..$ body_wagon : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ drive_others : num [1:156] 0 0 0 0 0 0 0 0 1 0 ...
## ..$ drive_rwd : num [1:156] 0 0 0 1 1 0 1 1 0 0 ...
## ..$ engine_loc_others : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ wheel_base : num [1:156] 93.1 86.6 97.2 98.4 94.5 94.5 95.9 94.5 97 96.5 ...
## ..$ length : num [1:156] 167 145 173 176 169 ...
## ..$ width : num [1:156] 64.2 63.9 65.2 65.6 64 64 66.3 64 65.4 64 ...
## ..$ height : num [1:156] 54.1 50.8 54.7 52 52.6 51.4 50.2 52.6 54.3 54.5 ...
## ..$ weight : num [1:156] 1950 1819 2324 2714 2204 ...
## ..$ engine_type_others: num [1:156] 0 0 0 0 0 0 0 0 1 0 ...
## ..$ cyl_others : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ cyl_six : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ engine_size : num [1:156] 91 92 120 146 98 109 156 98 108 92 ...
## ..$ fuel_sys_idi : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ fuel_sys_mpfi : num [1:156] 0 0 0 1 0 1 0 0 0 0 ...
## ..$ fuel_sys_others : num [1:156] 0 1 0 0 0 0 1 0 0 1 ...
## ..$ bore : num [1:156] 3.08 2.91 3.33 3.62 3.19 3.19 3.59 3.19 3.62 2.91 ...
## ..$ stroke : num [1:156] 3.15 3.41 3.47 3.5 3.03 3.4 3.86 3.03 2.64 3.41 ...
## ..$ compr_ratio : num [1:156] 9 9.2 8.5 9.3 9 8.5 7 9 9 9.2 ...
## ..$ hp : num [1:156] 68 76 97 116 70 90 145 70 82 76 ...
## ..$ peak_rpm : num [1:156] 5000 6000 5200 4800 4800 5500 5000 4800 4800 6000 ...
## ..$ city_mpg : num [1:156] 31 31 27 24 29 24 19 29 24 30 ...
## ..$ high_mpg : num [1:156] 38 38 34 30 34 29 24 34 25 34 ...
## ..$ price : num [1:156] 7395 6855 8949 11549 8238 ...
## ..$ make_agg_toyota : num [1:156] 0 0 0 1 1 0 0 1 0 0 ...
## ..- attr(*, ".internal.selfref")=<externalptr>
## $ test :Classes 'data.table' and 'data.frame': 39 obs. of 31 variables:
## ..$ fuel_gas : num [1:39] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ aspiration_turbo : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ doors_others : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ doors_two : num [1:39] 0 0 0 1 1 0 0 1 1 0 ...
## ..$ body_others : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ body_sedan : num [1:39] 1 0 1 0 0 0 1 0 0 1 ...
## ..$ body_wagon : num [1:39] 0 1 0 0 0 0 0 0 0 0 ...
## ..$ drive_others : num [1:39] 1 0 0 0 0 0 0 0 0 0 ...
## ..$ drive_rwd : num [1:39] 0 0 1 0 0 0 0 0 0 0 ...
## ..$ engine_loc_others : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ wheel_base : num [1:39] 99.4 105.8 103.5 88.4 93.7 ...
## ..$ length : num [1:39] 177 193 189 141 157 ...
## ..$ width : num [1:39] 66.4 71.4 66.9 60.3 63.8 63.8 63.8 64 65.2 65.2 ...
## ..$ height : num [1:39] 54.3 55.7 55.7 53.2 50.8 50.6 50.6 52.6 53.3 54.1 ...
## ..$ weight : num [1:39] 2824 2954 3230 1488 1876 ...
## ..$ engine_type_others: num [1:39] 0 0 0 1 0 0 0 0 0 0 ...
## ..$ cyl_others : num [1:39] 1 1 0 1 0 0 0 0 0 0 ...
## ..$ cyl_six : num [1:39] 0 0 1 0 0 0 0 0 0 0 ...
## ..$ engine_size : num [1:39] 136 136 209 61 90 90 90 92 110 110 ...
## ..$ fuel_sys_idi : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ fuel_sys_mpfi : num [1:39] 1 1 1 0 0 0 0 0 0 0 ...
## ..$ fuel_sys_others : num [1:39] 0 0 0 0 0 0 0 1 1 1 ...
## ..$ bore : num [1:39] 3.19 3.19 3.62 2.91 2.97 2.97 2.97 2.91 3.15 3.15 ...
## ..$ stroke : num [1:39] 3.4 3.4 3.39 3.03 3.23 3.23 3.23 3.41 3.58 3.58 ...
## ..$ compr_ratio : num [1:39] 8 8.5 8 9.5 9.4 9.4 9.4 9.2 9 9 ...
## ..$ hp : num [1:39] 115 110 182 48 68 68 68 76 86 86 ...
## ..$ peak_rpm : num [1:39] 5500 5500 5400 5100 5500 5500 5500 6000 5800 5800 ...
## ..$ city_mpg : num [1:39] 18 19 16 47 31 31 31 30 27 27 ...
## ..$ high_mpg : num [1:39] 22 25 22 53 38 38 38 34 33 33 ...
## ..$ price : num [1:39] 17450 18920 30760 5151 6377 ...
## ..$ make_agg_toyota : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, ".internal.selfref")=<externalptr>
# Model specification shared by the trees below: price is the response and
# `.` expands to every other column of the data passed to the fitting call.
# A `~` expression is already a formula object, so no conversion is needed.
formula <- price ~ .
#### 1.1 Base R Partitioning Tree
library(rpart)
library(rpart.plot)
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
# Grow a regression tree ("anova" method) on the training partition.
# cp = 0 removes the complexity penalty, so growth is limited only by the
# remaining rpart controls; model = TRUE asks rpart to keep the model frame.
tree_0 <- rpart(
  formula = formula,
  data = whole_data$train,
  method = "anova",
  model = TRUE,
  cp = 0
)
print(tree_0)
## n= 156
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 156 9507260000 13064.040
## 2) engine_size< 182 146 3229339000 11427.030
## 4) weight< 2544 89 442823600 8395.393
## 8) length< 172.7 63 86829530 7401.762
## 16) weight< 2124 36 23223920 6694.861
## 32) weight< 1947 15 5064828 6093.800 *
## 33) weight>=1947 21 8869173 7124.190
## 66) hp< 68.5 8 3449536 6596.000 *
## 67) hp>=68.5 13 1814290 7449.231 *
## 17) weight>=2124 27 21630080 8344.296
## 34) hp< 89 19 9388133 8031.526 *
## 35) hp>=89 8 5968915 9087.125 *
## 9) length>=172.7 26 143078300 10803.040
## 18) peak_rpm< 5350 19 17366900 9743.421 *
## 19) peak_rpm>=5350 7 46474610 13679.140 *
## 5) weight>=2544 57 691334500 16160.630
## 10) width< 68.6 50 491555400 15542.020
## 20) hp< 118 27 142082300 13943.780
## 40) weight< 2923.5 15 68462070 13016.530 *
## 41) weight>=2923.5 12 44602590 15102.830 *
## 21) hp>=118 23 199542200 17418.220
## 42) stroke>=3.31 10 21436830 14821.100 *
## 43) stroke< 3.31 13 58770460 19416.000 *
## 11) width>=68.6 7 43973520 20579.290 *
## 3) engine_size>=182 10 174348500 36964.500 *
# Same tree shown through partykit's representation (indented node listing).
print(partykit::as.party(tree_0))
##
## Model formula:
## price ~ fuel_gas + aspiration_turbo + doors_others + doors_two +
## body_others + body_sedan + body_wagon + drive_others + drive_rwd +
## engine_loc_others + wheel_base + length + width + height +
## weight + engine_type_others + cyl_others + cyl_six + engine_size +
## fuel_sys_idi + fuel_sys_mpfi + fuel_sys_others + bore + stroke +
## compr_ratio + hp + peak_rpm + city_mpg + high_mpg + make_agg_toyota
##
## Fitted party:
## [1] root
## | [2] engine_size < 182
## | | [3] weight < 2544
## | | | [4] length < 172.7
## | | | | [5] weight < 2124
## | | | | | [6] weight < 1947: 6093.800 (n = 15, err = 5064828.4)
## | | | | | [7] weight >= 1947
## | | | | | | [8] hp < 68.5: 6596.000 (n = 8, err = 3449536.0)
## | | | | | | [9] hp >= 68.5: 7449.231 (n = 13, err = 1814290.3)
## | | | | [10] weight >= 2124
## | | | | | [11] hp < 89: 8031.526 (n = 19, err = 9388132.7)
## | | | | | [12] hp >= 89: 9087.125 (n = 8, err = 5968914.9)
## | | | [13] length >= 172.7
## | | | | [14] peak_rpm < 5350: 9743.421 (n = 19, err = 17366898.6)
## | | | | [15] peak_rpm >= 5350: 13679.143 (n = 7, err = 46474610.9)
## | | [16] weight >= 2544
## | | | [17] width < 68.6
## | | | | [18] hp < 118
## | | | | | [19] weight < 2923.5: 13016.533 (n = 15, err = 68462073.7)
## | | | | | [20] weight >= 2923.5: 15102.833 (n = 12, err = 44602589.7)
## | | | | [21] hp >= 118
## | | | | | [22] stroke >= 3.31: 14821.100 (n = 10, err = 21436830.9)
## | | | | | [23] stroke < 3.31: 19416.000 (n = 13, err = 58770458.0)
## | | | [24] width >= 68.6: 20579.286 (n = 7, err = 43973521.4)
## | [25] engine_size >= 182: 36964.500 (n = 10, err = 174348546.5)
##
## Number of inner nodes: 12
## Number of terminal nodes: 13
# List the components stored in the fitted rpart object (sorted by name).
ls(tree_0)
## [1] "call" "control" "cptable"
## [4] "frame" "functions" "method"
## [7] "model" "numresp" "ordered"
## [10] "parms" "splits" "terms"
## [13] "variable.importance" "where"
# One row per node: split variable, node size, deviance, fitted value, ...
tree_0[["frame"]]
## var n wt dev yval complexity ncompete
## 1 engine_size 156 156 9507260033 13064.045 0.6419906639 4
## 2 weight 146 146 3229339306 11427.027 0.2203769724 4
## 4 length 89 89 442823613 8395.393 0.0223950693 4
## 8 weight 63 63 86829529 7401.762 0.0044151031 4
## 16 weight 36 36 23223916 6694.861 0.0009771390 4
## 32 <leaf> 15 15 5064828 6093.800 0.0000000000 0
## 33 hp 21 21 8869173 7124.190 0.0003792204 4
## 66 <leaf> 8 8 3449536 6596.000 0.0000000000 0
## 67 <leaf> 13 13 1814290 7449.231 0.0000000000 0
## 17 hp 27 27 21630080 8344.296 0.0006598149 4
## 34 <leaf> 19 19 9388133 8031.526 0.0000000000 0
## 35 <leaf> 8 8 5968915 9087.125 0.0000000000 0
## 9 peak_rpm 26 26 143078337 10803.038 0.0083343495 4
## 18 <leaf> 19 19 17366899 9743.421 0.0000000000 0
## 19 <leaf> 7 7 46474611 13679.143 0.0000000000 0
## 5 width 57 57 691334511 16160.632 0.0163880597 4
## 10 hp 50 50 491555445 15542.020 0.0157701476 4
## 20 weight 27 27 142082315 13943.778 0.0030521571 4
## 40 <leaf> 15 15 68462074 13016.533 0.0000000000 0
## 41 <leaf> 12 12 44602590 15102.833 0.0000000000 0
## 21 stroke 23 23 199542236 17418.217 0.0125519810 4
## 42 <leaf> 10 10 21436831 14821.100 0.0000000000 0
## 43 <leaf> 13 13 58770458 19416.000 0.0000000000 0
## 11 <leaf> 7 7 43973521 20579.286 0.0000000000 0
## 3 <leaf> 10 10 174348546 36964.500 0.0000000000 0
## nsurrogate
## 1 5
## 2 5
## 4 5
## 8 5
## 16 5
## 32 0
## 33 5
## 66 0
## 67 0
## 17 5
## 34 0
## 35 0
## 9 5
## 18 0
## 19 0
## 5 2
## 10 5
## 20 5
## 40 0
## 41 0
## 21 5
## 42 0
## 43 0
## 11 0
## 3 0
# Control parameters the tree was grown with (cp = 0 plus rpart defaults).
tree_0[["control"]]
## $minsplit
## [1] 20
##
## $minbucket
## [1] 7
##
## $cp
## [1] 0
##
## $maxcompete
## [1] 4
##
## $maxsurrogate
## [1] 5
##
## $usesurrogate
## [1] 2
##
## $surrogatestyle
## [1] 0
##
## $maxdepth
## [1] 30
##
## $xval
## [1] 10
# Variable importance scores, printed in decreasing order.
tree_0[["variable.importance"]]
## engine_size weight hp city_mpg
## 7956375152 6595368016 5939116695 5820053825
## width length high_mpg drive_rwd
## 3402342998 3313757603 1770753309 1323272325
## bore wheel_base height stroke
## 226203752 199613450 162772363 121137620
## peak_rpm cyl_six fuel_sys_mpfi fuel_sys_others
## 83157472 71706080 59667474 59667474
## cyl_others compr_ratio doors_two aspiration_turbo
## 22257935 7017283 3096638 2352387
# Complexity table: one row per candidate pruning level (CP, number of
# splits, relative error, cross-validated error and its std. deviation).
tree_0[["cptable"]]
## CP nsplit rel error xerror xstd
## 1 0.6419906639 0 1.00000000 1.0061701 0.19634783
## 2 0.2203769724 1 0.35800934 0.4096693 0.05526871
## 3 0.0223950693 2 0.13763236 0.1966299 0.03396699
## 4 0.0163880597 3 0.11523729 0.1784056 0.03210153
## 5 0.0157701476 4 0.09884923 0.1801067 0.02947649
## 6 0.0125519810 5 0.08307909 0.1762945 0.02932287
## 7 0.0083343495 6 0.07052711 0.1539249 0.02509774
## 8 0.0044151031 7 0.06219276 0.1418581 0.02354036
## 9 0.0030521571 8 0.05777765 0.1393956 0.02332931
## 10 0.0009771390 9 0.05472550 0.1373463 0.02303610
## 11 0.0006598149 10 0.05374836 0.1361763 0.02306136
## 12 0.0003792204 11 0.05308854 0.1354099 0.02308686
## 13 0.0000000000 12 0.05270932 0.1347064 0.02310826
# Plotting a matrix uses its first two columns, i.e. CP (x) against nsplit (y).
# NOTE(review): if the goal is to inspect cross-validated error against cp,
# rpart::plotcp(tree_0) may be the better tool -- confirm intent.
plot(tree_0$cptable, type = "b")
grid()

# Pruning at the largest CP in the table collapses the tree to its root.
prune(tree = tree_0, cp = max(tree_0$cptable[, "CP"]))
## n= 156
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 156 9507260000 13064.04 *
# Pruning at the smallest CP in the table (0 here) keeps the full tree.
prune(tree = tree_0, cp = min(tree_0$cptable[, "CP"]))
## n= 156
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 156 9507260000 13064.040
## 2) engine_size< 182 146 3229339000 11427.030
## 4) weight< 2544 89 442823600 8395.393
## 8) length< 172.7 63 86829530 7401.762
## 16) weight< 2124 36 23223920 6694.861
## 32) weight< 1947 15 5064828 6093.800 *
## 33) weight>=1947 21 8869173 7124.190
## 66) hp< 68.5 8 3449536 6596.000 *
## 67) hp>=68.5 13 1814290 7449.231 *
## 17) weight>=2124 27 21630080 8344.296
## 34) hp< 89 19 9388133 8031.526 *
## 35) hp>=89 8 5968915 9087.125 *
## 9) length>=172.7 26 143078300 10803.040
## 18) peak_rpm< 5350 19 17366900 9743.421 *
## 19) peak_rpm>=5350 7 46474610 13679.140 *
## 5) weight>=2544 57 691334500 16160.630
## 10) width< 68.6 50 491555400 15542.020
## 20) hp< 118 27 142082300 13943.780
## 40) weight< 2923.5 15 68462070 13016.530 *
## 41) weight>=2923.5 12 44602590 15102.830 *
## 21) hp>=118 23 199542200 17418.220
## 42) stroke>=3.31 10 21436830 14821.100 *
## 43) stroke< 3.31 13 58770460 19416.000 *
## 11) width>=68.6 7 43973520 20579.290 *
## 3) engine_size>=182 10 174348500 36964.500 *
# plotting the tree
# basic plot: draw the branches first, then add the split labels on top.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
plot(tree_0, uniform = TRUE, branch = 0.5, compress = TRUE)
text(tree_0, cex = 0.75)

# rpart.plot from the rpart.plot library
# First with the leaves at their own depth, then with all leaves aligned at
# the bottom and a green box palette. TRUE/FALSE are spelled out because
# T and F are ordinary variables that can be reassigned.
rpart.plot(tree_0, fallen.leaves = FALSE)

rpart.plot(tree_0, fallen.leaves = TRUE, box.palette = "Gn")

# interactive tree plot
library(visNetwork)
# Default widget first, then a customised version of the same tree.
visTree(tree_0)
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned; every option value is otherwise unchanged.
visTree(
  tree_0,
  fallenLeaves = TRUE,
  edgesFontSize = 14,
  nodesFontSize = 16,
  legend = TRUE,
  colorVar = RColorBrewer::brewer.pal(12, "Paired"),
  colorEdges = "darkgray",
  colorY = c("palegreen", "tomato"),
  main = "Regression Tree",
  tooltipDelay = 0.001,
  digits = 0,
  minNodeSize = 10,
  highlightNearest = list(
    enabled = TRUE,
    hover = TRUE,
    algorithm = "hierarchical"
  ),
  collapse = list(
    enabled = TRUE,
    fit = TRUE,
    resetHighlight = TRUE,
    clusterOptions = list(fixed = FALSE, physics = FALSE)
  ),
  nodesPopSize = TRUE,
  edgesFontAlign = "horizontal"
)
# let's generate an NA in engine_size
# Work on a deep copy so the original training set stays free of NAs, and
# blank out only the single cell (row 27, engine_size) by reference instead of
# rewriting the whole row through a nested `[` / `[[<-` replacement.
whole_data$train_2 <- copy(whole_data$train)[27L, engine_size := NA]
# Sanity check: the original has no missing values, the copy has exactly one.
sum(is.na(whole_data$train))
## [1] 0
sum(is.na(whole_data$train_2))
## [1] 1
# Refit the same unpruned tree on the copy that contains one missing
# engine_size; the output below still reports n = 156, so the row is kept.
tree_1 <- rpart(
  formula = formula,
  data = whole_data$train_2,
  method = "anova",
  model = TRUE,
  cp = 0
)
# summary() prints the detailed per-node report (primary and surrogate
# splits); the extracted `frame` component is then auto-printed after it.
summary(tree_1)[["frame"]]
## Call:
## rpart(formula = formula, data = whole_data$train_2, method = "anova",
## model = TRUE, cp = 0)
## n= 156
##
## CP nsplit rel error xerror xstd
## 1 0.6419906639 0 1.00000000 1.0100025 0.19821010
## 2 0.2203769724 1 0.35800934 0.4066895 0.05483298
## 3 0.0223950693 2 0.13763236 0.1762818 0.03182151
## 4 0.0163880597 3 0.11523729 0.1639441 0.03109905
## 5 0.0157701476 4 0.09884923 0.1503590 0.02296524
## 6 0.0125519810 5 0.08307909 0.1543844 0.02286437
## 7 0.0083343495 6 0.07052711 0.1507823 0.02255946
## 8 0.0044151031 7 0.06219276 0.1403067 0.02153874
## 9 0.0030521571 8 0.05777765 0.1392559 0.02165782
## 10 0.0009771390 9 0.05472550 0.1311765 0.02047493
## 11 0.0006598149 10 0.05374836 0.1296058 0.02041857
## 12 0.0003792204 11 0.05308854 0.1299522 0.02040772
## 13 0.0000000000 12 0.05270932 0.1294686 0.02042537
##
## Variable importance
## engine_size weight hp city_mpg width length
## 21 18 16 16 9 9
## high_mpg drive_rwd bore wheel_base
## 5 4 1 1
##
## Node number 1: 156 observations, complexity param=0.6419907
## mean=13064.04, MSE=6.094397e+07
## left son=2 (146 obs) right son=3 (10 obs)
## Primary splits:
## engine_size < 182 to the left, improve=0.6403823, (1 missing)
## city_mpg < 17.5 to the right, improve=0.5479412, (0 missing)
## hp < 175.5 to the left, improve=0.5262288, (0 missing)
## weight < 2697.5 to the left, improve=0.5036809, (0 missing)
## high_mpg < 28.5 to the right, improve=0.4750277, (0 missing)
## Surrogate splits:
## weight < 3495 to the left, agree=0.981, adj=0.7, (1 split)
## hp < 175.5 to the left, agree=0.981, adj=0.7, (0 split)
## city_mpg < 16.5 to the right, agree=0.981, adj=0.7, (0 split)
## length < 199.05 to the left, agree=0.968, adj=0.5, (0 split)
## width < 69.25 to the left, agree=0.968, adj=0.5, (0 split)
##
## Node number 2: 146 observations, complexity param=0.220377
## mean=11427.03, MSE=2.211876e+07
## left son=4 (89 obs) right son=5 (57 obs)
## Primary splits:
## weight < 2544 to the left, improve=0.6487956, (0 missing)
## high_mpg < 28.5 to the right, improve=0.5943413, (0 missing)
## engine_size < 126 to the left, improve=0.5649050, (1 missing)
## hp < 94.5 to the left, improve=0.5402361, (0 missing)
## city_mpg < 23.5 to the right, improve=0.4989328, (0 missing)
## Surrogate splits:
## high_mpg < 28.5 to the right, agree=0.911, adj=0.772, (0 split)
## engine_size < 126 to the left, agree=0.890, adj=0.719, (0 split)
## hp < 104 to the left, agree=0.877, adj=0.684, (0 split)
## city_mpg < 22 to the right, agree=0.863, adj=0.649, (0 split)
## drive_rwd < 0.5 to the left, agree=0.856, adj=0.632, (0 split)
##
## Node number 3: 10 observations
## mean=36964.5, MSE=1.743485e+07
##
## Node number 4: 89 observations, complexity param=0.02239507
## mean=8395.393, MSE=4975546
## left son=8 (63 obs) right son=9 (26 obs)
## Primary splits:
## length < 172.7 to the left, improve=0.4808139, (0 missing)
## weight < 2287.5 to the left, improve=0.4666525, (0 missing)
## wheel_base < 98.6 to the left, improve=0.4051903, (0 missing)
## width < 64.5 to the left, improve=0.4023945, (0 missing)
## hp < 83 to the left, improve=0.3833154, (0 missing)
## Surrogate splits:
## wheel_base < 97.85 to the left, agree=0.921, adj=0.731, (0 split)
## weight < 2301 to the left, agree=0.910, adj=0.692, (0 split)
## width < 65.55 to the left, agree=0.876, adj=0.577, (0 split)
## engine_size < 115.5 to the left, agree=0.876, adj=0.577, (0 split)
## bore < 3.29 to the left, agree=0.831, adj=0.423, (0 split)
##
## Node number 5: 57 observations, complexity param=0.01638806
## mean=16160.63, MSE=1.212868e+07
## left son=10 (50 obs) right son=11 (7 obs)
## Primary splits:
## width < 68.6 to the left, improve=0.2253693, (0 missing)
## hp < 118 to the left, improve=0.2057451, (0 missing)
## cyl_six < 0.5 to the left, improve=0.1899323, (0 missing)
## wheel_base < 100.8 to the left, improve=0.1879356, (0 missing)
## weight < 2697.5 to the left, improve=0.1791694, (0 missing)
## Surrogate splits:
## wheel_base < 108.55 to the left, agree=0.895, adj=0.143, (0 split)
## cyl_others < 0.5 to the left, agree=0.895, adj=0.143, (0 split)
##
## Node number 8: 63 observations, complexity param=0.004415103
## mean=7401.762, MSE=1378246
## left son=16 (36 obs) right son=17 (27 obs)
## Primary splits:
## weight < 2124 to the left, improve=0.4834246, (0 missing)
## engine_size < 94.5 to the left, improve=0.4310051, (1 missing)
## length < 165.45 to the left, improve=0.3917216, (0 missing)
## city_mpg < 29.5 to the right, improve=0.3752696, (0 missing)
## hp < 80 to the left, improve=0.3258077, (0 missing)
## Surrogate splits:
## engine_size < 97.5 to the left, agree=0.889, adj=0.741, (0 split)
## city_mpg < 29.5 to the right, agree=0.889, adj=0.741, (0 split)
## hp < 77 to the left, agree=0.857, adj=0.667, (0 split)
## high_mpg < 36.5 to the right, agree=0.841, adj=0.630, (0 split)
## length < 165.65 to the left, agree=0.825, adj=0.593, (0 split)
##
## Node number 9: 26 observations, complexity param=0.008334349
## mean=10803.04, MSE=5503013
## left son=18 (19 obs) right son=19 (7 obs)
## Primary splits:
## peak_rpm < 5350 to the left, improve=0.5538003, (0 missing)
## hp < 99 to the left, improve=0.3699824, (0 missing)
## engine_size < 115 to the right, improve=0.2830159, (0 missing)
## fuel_sys_mpfi < 0.5 to the left, improve=0.2509445, (0 missing)
## length < 176.4 to the left, improve=0.2469144, (0 missing)
## Surrogate splits:
## bore < 3.23 to the right, agree=0.923, adj=0.714, (0 split)
## engine_size < 115 to the right, agree=0.885, adj=0.571, (0 split)
## hp < 99 to the left, agree=0.885, adj=0.571, (0 split)
## city_mpg < 23.5 to the right, agree=0.885, adj=0.571, (0 split)
## high_mpg < 29.5 to the right, agree=0.885, adj=0.571, (0 split)
##
## Node number 10: 50 observations, complexity param=0.01577015
## mean=15542.02, MSE=9831109
## left son=20 (27 obs) right son=21 (23 obs)
## Primary splits:
## hp < 118 to the left, improve=0.3050132, (0 missing)
## engine_size < 162.5 to the left, improve=0.2531597, (0 missing)
## cyl_six < 0.5 to the left, improve=0.2348309, (0 missing)
## weight < 2697.5 to the left, improve=0.1646924, (0 missing)
## peak_rpm < 4375 to the right, improve=0.1423281, (0 missing)
## Surrogate splits:
## city_mpg < 20.5 to the right, agree=0.86, adj=0.696, (0 split)
## engine_size < 154 to the left, agree=0.84, adj=0.652, (0 split)
## cyl_six < 0.5 to the left, agree=0.76, adj=0.478, (0 split)
## high_mpg < 26.5 to the right, agree=0.76, adj=0.478, (0 split)
## height < 54.85 to the right, agree=0.74, adj=0.435, (0 split)
##
## Node number 11: 7 observations
## mean=20579.29, MSE=6281932
##
## Node number 16: 36 observations, complexity param=0.000977139
## mean=6694.861, MSE=645108.8
## left son=32 (15 obs) right son=33 (21 obs)
## Primary splits:
## weight < 1947 to the left, improve=0.4000150, (0 missing)
## hp < 68.5 to the left, improve=0.3353818, (0 missing)
## length < 160.75 to the left, improve=0.3168454, (0 missing)
## bore < 3.065 to the left, improve=0.3050193, (0 missing)
## engine_size < 94.5 to the left, improve=0.2602266, (0 missing)
## Surrogate splits:
## city_mpg < 36 to the right, agree=0.750, adj=0.400, (0 split)
## high_mpg < 40 to the right, agree=0.750, adj=0.400, (0 split)
## doors_two < 0.5 to the right, agree=0.722, adj=0.333, (0 split)
## bore < 3.04 to the left, agree=0.722, adj=0.333, (0 split)
## compr_ratio < 9.405 to the right, agree=0.722, adj=0.333, (0 split)
##
## Node number 17: 27 observations, complexity param=0.0006598149
## mean=8344.296, MSE=801114.1
## left son=34 (19 obs) right son=35 (8 obs)
## Primary splits:
## hp < 89 to the left, improve=0.2900143, (0 missing)
## fuel_sys_mpfi < 0.5 to the left, improve=0.1561566, (0 missing)
## peak_rpm < 5100 to the left, improve=0.1321882, (0 missing)
## weight < 2287.5 to the left, improve=0.1298636, (0 missing)
## bore < 3.23 to the right, improve=0.1232483, (0 missing)
## Surrogate splits:
## compr_ratio < 8.05 to the right, agree=0.889, adj=0.625, (0 split)
## peak_rpm < 5375 to the left, agree=0.889, adj=0.625, (0 split)
## city_mpg < 24.5 to the right, agree=0.852, adj=0.500, (0 split)
## high_mpg < 30.5 to the right, agree=0.852, adj=0.500, (0 split)
## aspiration_turbo < 0.5 to the left, agree=0.815, adj=0.375, (0 split)
##
## Node number 18: 19 observations
## mean=9743.421, MSE=914047.3
##
## Node number 19: 7 observations
## mean=13679.14, MSE=6639230
##
## Node number 20: 27 observations, complexity param=0.003052157
## mean=13943.78, MSE=5262308
## left son=40 (15 obs) right son=41 (12 obs)
## Primary splits:
## weight < 2923.5 to the left, improve=0.20423130, (0 missing)
## wheel_base < 102.35 to the left, improve=0.19255060, (0 missing)
## compr_ratio < 9.405 to the left, improve=0.17862940, (0 missing)
## hp < 96 to the right, improve=0.09875490, (0 missing)
## length < 186.65 to the left, improve=0.08896211, (0 missing)
## Surrogate splits:
## length < 186.65 to the left, agree=0.926, adj=0.833, (0 split)
## width < 67.05 to the left, agree=0.926, adj=0.833, (0 split)
## height < 56.15 to the left, agree=0.926, adj=0.833, (0 split)
## wheel_base < 102.35 to the left, agree=0.889, adj=0.750, (0 split)
## bore < 3.66 to the left, agree=0.778, adj=0.500, (0 split)
##
## Node number 21: 23 observations, complexity param=0.01255198
## mean=17418.22, MSE=8675749
## left son=42 (10 obs) right son=43 (13 obs)
## Primary splits:
## stroke < 3.31 to the right, improve=0.5980435, (0 missing)
## high_mpg < 24.5 to the left, improve=0.3815231, (0 missing)
## height < 54.2 to the left, improve=0.3073280, (0 missing)
## compr_ratio < 7.65 to the left, improve=0.2493117, (0 missing)
## body_sedan < 0.5 to the left, improve=0.2394175, (0 missing)
## Surrogate splits:
## height < 54.2 to the left, agree=0.826, adj=0.6, (0 split)
## fuel_sys_mpfi < 0.5 to the left, agree=0.783, adj=0.5, (0 split)
## fuel_sys_others < 0.5 to the right, agree=0.783, adj=0.5, (0 split)
## bore < 3.29 to the left, agree=0.783, adj=0.5, (0 split)
## width < 66.7 to the left, agree=0.739, adj=0.4, (0 split)
##
## Node number 32: 15 observations
## mean=6093.8, MSE=337655.2
##
## Node number 33: 21 observations, complexity param=0.0003792204
## mean=7124.19, MSE=422341.6
## left son=66 (8 obs) right son=67 (13 obs)
## Primary splits:
## hp < 68.5 to the left, improve=0.4065032, (0 missing)
## bore < 3.065 to the left, improve=0.3706976, (0 missing)
## length < 160.55 to the left, improve=0.3437479, (0 missing)
## engine_size < 94.5 to the left, improve=0.3238951, (0 missing)
## stroke < 3.26 to the left, improve=0.3157922, (0 missing)
## Surrogate splits:
## high_mpg < 37.5 to the right, agree=0.952, adj=0.875, (0 split)
## engine_size < 94.5 to the left, agree=0.857, adj=0.625, (0 split)
## bore < 3.115 to the left, agree=0.857, adj=0.625, (0 split)
## height < 51.7 to the left, agree=0.810, adj=0.500, (0 split)
## stroke < 3.26 to the left, agree=0.810, adj=0.500, (0 split)
##
## Node number 34: 19 observations
## mean=8031.526, MSE=494112.2
##
## Node number 35: 8 observations
## mean=9087.125, MSE=746114.4
##
## Node number 40: 15 observations
## mean=13016.53, MSE=4564138
##
## Node number 41: 12 observations
## mean=15102.83, MSE=3716882
##
## Node number 42: 10 observations
## mean=14821.1, MSE=2143683
##
## Node number 43: 13 observations
## mean=19416, MSE=4520804
##
## Node number 66: 8 observations
## mean=6596, MSE=431192
##
## Node number 67: 13 observations
## mean=7449.231, MSE=139560.8
## var n wt dev yval complexity ncompete
## 1 engine_size 156 156 9507260033 13064.045 0.6419906639 4
## 2 weight 146 146 3229339306 11427.027 0.2203769724 4
## 4 length 89 89 442823613 8395.393 0.0223950693 4
## 8 weight 63 63 86829529 7401.762 0.0044151031 4
## 16 weight 36 36 23223916 6694.861 0.0009771390 4
## 32 <leaf> 15 15 5064828 6093.800 0.0000000000 0
## 33 hp 21 21 8869173 7124.190 0.0003792204 4
## 66 <leaf> 8 8 3449536 6596.000 0.0000000000 0
## 67 <leaf> 13 13 1814290 7449.231 0.0000000000 0
## 17 hp 27 27 21630080 8344.296 0.0006598149 4
## 34 <leaf> 19 19 9388133 8031.526 0.0000000000 0
## 35 <leaf> 8 8 5968915 9087.125 0.0000000000 0
## 9 peak_rpm 26 26 143078337 10803.038 0.0083343495 4
## 18 <leaf> 19 19 17366899 9743.421 0.0000000000 0
## 19 <leaf> 7 7 46474611 13679.143 0.0000000000 0
## 5 width 57 57 691334511 16160.632 0.0163880597 4
## 10 hp 50 50 491555445 15542.020 0.0157701476 4
## 20 weight 27 27 142082315 13943.778 0.0030521571 4
## 40 <leaf> 15 15 68462074 13016.533 0.0000000000 0
## 41 <leaf> 12 12 44602590 15102.833 0.0000000000 0
## 21 stroke 23 23 199542236 17418.217 0.0125519810 4
## 42 <leaf> 10 10 21436831 14821.100 0.0000000000 0
## 43 <leaf> 13 13 58770458 19416.000 0.0000000000 0
## 11 <leaf> 7 7 43973521 20579.286 0.0000000000 0
## 3 <leaf> 10 10 174348546 36964.500 0.0000000000 0
## nsurrogate
## 1 5
## 2 5
## 4 5
## 8 5
## 16 5
## 32 0
## 33 5
## 66 0
## 67 0
## 17 5
## 34 0
## 35 0
## 9 5
## 18 0
## 19 0
## 5 2
## 10 5
## 20 5
## 40 0
## 41 0
## 21 5
## 42 0
## 43 0
## 11 0
## 3 0
# Tree fitted with the missing value, in partykit's representation ...
print(partykit::as.party(tree_1))
##
## Model formula:
## price ~ fuel_gas + aspiration_turbo + doors_others + doors_two +
## body_others + body_sedan + body_wagon + drive_others + drive_rwd +
## engine_loc_others + wheel_base + length + width + height +
## weight + engine_type_others + cyl_others + cyl_six + engine_size +
## fuel_sys_idi + fuel_sys_mpfi + fuel_sys_others + bore + stroke +
## compr_ratio + hp + peak_rpm + city_mpg + high_mpg + make_agg_toyota
##
## Fitted party:
## [1] root
## | [2] engine_size < 182
## | | [3] weight < 2544
## | | | [4] length < 172.7
## | | | | [5] weight < 2124
## | | | | | [6] weight < 1947: 6093.800 (n = 15, err = 5064828.4)
## | | | | | [7] weight >= 1947
## | | | | | | [8] hp < 68.5: 6596.000 (n = 8, err = 3449536.0)
## | | | | | | [9] hp >= 68.5: 7449.231 (n = 13, err = 1814290.3)
## | | | | [10] weight >= 2124
## | | | | | [11] hp < 89: 8031.526 (n = 19, err = 9388132.7)
## | | | | | [12] hp >= 89: 9087.125 (n = 8, err = 5968914.9)
## | | | [13] length >= 172.7
## | | | | [14] peak_rpm < 5350: 9743.421 (n = 19, err = 17366898.6)
## | | | | [15] peak_rpm >= 5350: 13679.143 (n = 7, err = 46474610.9)
## | | [16] weight >= 2544
## | | | [17] width < 68.6
## | | | | [18] hp < 118
## | | | | | [19] weight < 2923.5: 13016.533 (n = 15, err = 68462073.7)
## | | | | | [20] weight >= 2923.5: 15102.833 (n = 12, err = 44602589.7)
## | | | | [21] hp >= 118
## | | | | | [22] stroke >= 3.31: 14821.100 (n = 10, err = 21436830.9)
## | | | | | [23] stroke < 3.31: 19416.000 (n = 13, err = 58770458.0)
## | | | [24] width >= 68.6: 20579.286 (n = 7, err = 43973521.4)
## | [25] engine_size >= 182: 36964.500 (n = 10, err = 174348546.5)
##
## Number of inner nodes: 12
## Number of terminal nodes: 13
# Same 'party' rendering for tree_0, so its rules can be read against tree_1's.
print(partykit::as.party(tree_0))
##
## Model formula:
## price ~ fuel_gas + aspiration_turbo + doors_others + doors_two +
## body_others + body_sedan + body_wagon + drive_others + drive_rwd +
## engine_loc_others + wheel_base + length + width + height +
## weight + engine_type_others + cyl_others + cyl_six + engine_size +
## fuel_sys_idi + fuel_sys_mpfi + fuel_sys_others + bore + stroke +
## compr_ratio + hp + peak_rpm + city_mpg + high_mpg + make_agg_toyota
##
## Fitted party:
## [1] root
## | [2] engine_size < 182
## | | [3] weight < 2544
## | | | [4] length < 172.7
## | | | | [5] weight < 2124
## | | | | | [6] weight < 1947: 6093.800 (n = 15, err = 5064828.4)
## | | | | | [7] weight >= 1947
## | | | | | | [8] hp < 68.5: 6596.000 (n = 8, err = 3449536.0)
## | | | | | | [9] hp >= 68.5: 7449.231 (n = 13, err = 1814290.3)
## | | | | [10] weight >= 2124
## | | | | | [11] hp < 89: 8031.526 (n = 19, err = 9388132.7)
## | | | | | [12] hp >= 89: 9087.125 (n = 8, err = 5968914.9)
## | | | [13] length >= 172.7
## | | | | [14] peak_rpm < 5350: 9743.421 (n = 19, err = 17366898.6)
## | | | | [15] peak_rpm >= 5350: 13679.143 (n = 7, err = 46474610.9)
## | | [16] weight >= 2544
## | | | [17] width < 68.6
## | | | | [18] hp < 118
## | | | | | [19] weight < 2923.5: 13016.533 (n = 15, err = 68462073.7)
## | | | | | [20] weight >= 2923.5: 15102.833 (n = 12, err = 44602589.7)
## | | | | [21] hp >= 118
## | | | | | [22] stroke >= 3.31: 14821.100 (n = 10, err = 21436830.9)
## | | | | | [23] stroke < 3.31: 19416.000 (n = 13, err = 58770458.0)
## | | | [24] width >= 68.6: 20579.286 (n = 7, err = 43973521.4)
## | [25] engine_size >= 182: 36964.500 (n = 10, err = 174348546.5)
##
## Number of inner nodes: 12
## Number of terminal nodes: 13
# Show row 27 of the train_2 table (its engine_size is NA in the printout below).
whole_data[["train_2"]][27L, ]
## fuel_gas aspiration_turbo doors_others doors_two body_others body_sedan
## 1: 1 0 0 1 0 0
## body_wagon drive_others drive_rwd engine_loc_others wheel_base length
## 1: 0 1 0 0 93.3 157.3
## width height weight engine_type_others cyl_others cyl_six engine_size
## 1: 63.8 55.7 2240 1 0 0 NA
## fuel_sys_idi fuel_sys_mpfi fuel_sys_others bore stroke compr_ratio hp
## 1: 0 0 0 3.62 2.64 8.7 73
## peak_rpm city_mpg high_mpg price make_agg_toyota
## 1: 4400 26 31 7603 0
# Predict the price for that single row with tree_1; a value is returned even
# though engine_size, the root split variable, is missing for this row.
predict(tree_1, newdata = whole_data$train_2[27])
## 1
## 8031.526
# Alternative partitioning algorithm: a Conditional Inference Tree
# (partykit::ctree), fitted with the `formula` object on whole_data$train.
library(partykit)
ctree_0 <- ctree(formula = formula, data = whole_data$train)
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
# Text summary of the fitted conditional inference tree (split rules per node).
print(x = ctree_0)
##
## Model formula:
## price ~ fuel_gas + aspiration_turbo + doors_others + doors_two +
## body_others + body_sedan + body_wagon + drive_others + drive_rwd +
## engine_loc_others + wheel_base + length + width + height +
## weight + engine_type_others + cyl_others + cyl_six + engine_size +
## fuel_sys_idi + fuel_sys_mpfi + fuel_sys_others + bore + stroke +
## compr_ratio + hp + peak_rpm + city_mpg + high_mpg + make_agg_toyota
##
## Fitted party:
## [1] root
## | [2] engine_size <= 181
## | | [3] weight <= 2540
## | | | [4] weight <= 2275
## | | | | [5] weight <= 2004: 6386.500 (n = 24, err = 12759550.0)
## | | | | [6] weight > 2004: 7794.367 (n = 30, err = 15484561.0)
## | | | [7] weight > 2275: 10288.086 (n = 35, err = 181506896.7)
## | | [8] weight > 2540
## | | | [9] cyl_six <= 0
## | | | | [10] width <= 66.9: 13823.727 (n = 22, err = 135640364.4)
## | | | | [11] width > 66.9: 16847.545 (n = 22, err = 219503325.5)
## | | | [12] cyl_six > 0: 18952.923 (n = 13, err = 104305834.9)
## | [13] engine_size > 181: 36964.500 (n = 10, err = 174348546.5)
##
## Number of inner nodes: 6
## Number of terminal nodes: 7
# Graphical rendering of the same conditional inference tree.
plot(x = ctree_0)

# Predictions of the regression tree tree_0 on the train and test sets.
# Called without newdata, predict() returns one fitted value per training row
# (156 values, matching whole_data$train in the str() output below).
train_tree <- predict(tree_0)
test_tree <- predict(tree_0, newdata = whole_data$test, type = "vector")

# Pair each observed price with its prediction. `id` is a running row index.
# seq_len(.N) is used instead of 1:.N so that an empty table produces an empty
# id column rather than c(1, 0).
df_fit <- whole_data$train[, .(id = seq_len(.N), price, train_tree)]
str(df_fit)
## Classes 'data.table' and 'data.frame': 156 obs. of 3 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 7395 6855 8949 11549 8238 ...
## $ train_tree: num 6596 6094 9743 13017 8032 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Same layout for the held-out test rows.
df_pred <- whole_data$test[, .(id = seq_len(.N), price, test_tree)]
str(df_pred)
## Classes 'data.table' and 'data.frame': 39 obs. of 3 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 17450 18920 30760 5151 6377 ...
## $ test_tree: num 13017 20579 36964 6094 6094 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Observed vs. predicted price by row index, one panel per data set.
# melt() stacks the price and prediction columns into variable/value pairs so
# that each series is drawn in its own colour.
p1 <- ggplot(
  data = melt(df_fit, id.vars = "id"),
  mapping = aes(x = id, y = value, colour = variable)
) +
  geom_point(alpha = 0.65) +
  geom_line(alpha = 0.65) +
  ylim(0, 50000) +
  labs(
    x = "",
    y = "$",
    title = "Regression Tree - Train Prediction on Automobile Price"
  ) +
  scale_colour_manual(values = c("black", "red"))

p2 <- ggplot(
  data = melt(df_pred, id.vars = "id"),
  mapping = aes(x = id, y = value, colour = variable)
) +
  geom_point(alpha = 0.65) +
  geom_line(alpha = 0.65) +
  ylim(0, 50000) +
  labs(
    x = "",
    y = "$",
    title = "Regression Tree - Test Prediction on Automobile Price"
  ) +
  scale_colour_manual(values = c("black", "blue"))

# Stack the train panel above the test panel in a single column.
library(gridExtra)
grid.arrange(p1, p2, ncol = 1)

# Performance metrics of the regression tree on the test set.
# rmse(), mae() and mape() are defined outside this section (presumably in the
# sourced regression_metrics.R -- confirm).
rmse_tree <- rmse(real = whole_data$test$price, predicted = test_tree)
rmse_tree
## [1] 3851.357
mae_tree <- mae(real = whole_data$test$price, predicted = test_tree)
mae_tree
## [1] 2703.175
mape_tree <- mape(real = whole_data$test$price, predicted = test_tree)
mape_tree
## [1] 0.1915807
# MAPE on the training set, for comparison with the test value above.
# Arguments are named like the test-set calls so the result cannot depend on
# the positional order of mape()'s parameters.
mape(real = whole_data$train$price, predicted = train_tree)
## [1] 0.09251727